Take home Exercise 3 - 4 Feb 2022

Animation of graphs in R

Frostbear https://sg.linkedin.com/in/farahfoo (SMU Masters in IT business (Fintech and Analytics))https://scis.smu.edu.sg/master-it-business
2022-02-05

Context of Exercise

Using previous age-sex pyramid based on 2021 data, to apply appropriate interactivity and animation methods to design an age-sex pyramid based data visualisation to show the changes of demographic structure of Singapore by age cohort and gender between 2000-2020 at planning area level. The data set used is entitle Singapore Residents by Planning Area / Subzone, Age Group, Sex and Type of Dwelling, June 2000-2010 and Singapore Residents by Planning Area / Subzone, Age Group, Sex and Type of Dwelling, June 2011-2020, from Department of Statistics home page.

Installing and loading packages required for Age-Sex pyramid

packages = c('tidyverse', 'readxl', 'ggthemes')

for(p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
  }
  library(p, character.only = T)
}

Loading data using _csv command

pop_data <- read_csv("data/respopagesextod2021.csv")
glimpse (data)
function (..., list = character(), package = NULL, lib.loc = NULL, 
    verbose = getOption("verbose"), envir = .GlobalEnv, overwrite = TRUE)  

Summarising data

summary_sex <- pop_data %>%
  group_by(AG, Sex) %>%
  summarise(Pop = sum(Pop)) %>%
  ungroup()

head (summary_sex,5)
# A tibble: 5 x 3
  AG       Sex        Pop
  <chr>    <chr>    <dbl>
1 0_to_4   Females  87730
2 0_to_4   Males    91400
3 10_to_14 Females  97980
4 10_to_14 Males   102330
5 15_to_19 Females 100190

Sorting Age group

To ensure the age group is sorted into the ideal sequence, we set the order we want.

order <- c("0_to_4", "5_to_9", "10_to_14", "15_to_19", "20_to_24", "25_to_29", "30_to_34", "35_to_39", "40_to_44", "45_to_49", "50_to_54", "55_to_59", "60_to_64", "65_to_69", "70_to_74", "75_to_79", "80_to_84", "85_to_89", "90_and_over")

summary_sex1 <- summary_sex %>%
  mutate(AG =  factor(AG, levels = order)) %>%
  arrange(AG)

head(summary_sex1,5)
# A tibble: 5 x 3
  AG       Sex        Pop
  <fct>    <chr>    <dbl>
1 0_to_4   Females  87730
2 0_to_4   Males    91400
3 5_to_9   Females  97120
4 5_to_9   Males   102390
5 10_to_14 Females  97980

Age-Sex pyramid for 1 year = 2015

ggplot(summary_sex1, aes(x = AG)) +

  geom_bar(data=summary_sex1[summary_sex1$Sex=="Males",], aes(y=Pop*-1), stat="identity", fill="blue") +
  
  geom_bar(data=summary_sex1[summary_sex1$Sex=="Females",], aes(y=Pop), stat="identity", fill="pink") +
  
  geom_hline(yintercept=0, colour="white", lwd=1)+
coord_flip () +
  
scale_y_continuous(breaks = seq(-160000,160000,40000)) +
  labs(y="Population", x="Gender") +
  ggtitle("                        Male                                                Female")

Building base graph for 20 years of population data

For animation of population across time, data source can be found here at singstat website

year2000 <- read_csv("data/respopagesextod2000to2010.csv")
year2011 <- read_csv("data/respopagesextod2011to2020.csv") 

head (year2000,3)
# A tibble: 3 x 7
  PA         SZ        AG     Sex   TOD                      Pop  Time
  <chr>      <chr>     <chr>  <chr> <chr>                  <dbl> <dbl>
1 Ang Mo Kio Cheng San 0_to_4 Males HDB 1- and 2-Room Fla~    20  2000
2 Ang Mo Kio Cheng San 0_to_4 Males HDB 3-Room Flats         480  2000
3 Ang Mo Kio Cheng San 0_to_4 Males HDB 4-Room Flats         220  2000
head (year2011,3)
# A tibble: 3 x 7
  PA         SZ                     AG     Sex   TOD         Pop  Time
  <chr>      <chr>                  <chr>  <chr> <chr>     <dbl> <dbl>
1 Ang Mo Kio Ang Mo Kio Town Centre 0_to_4 Males HDB 1- a~     0  2011
2 Ang Mo Kio Ang Mo Kio Town Centre 0_to_4 Males HDB 3-Ro~    10  2011
3 Ang Mo Kio Ang Mo Kio Town Centre 0_to_4 Males HDB 4-Ro~    30  2011
# Since columns are the same, we can combine the 2 files into 1 file for processing

combined <- rbind(year2000,year2011)
unique(combined$Time)
 [1] 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012
[14] 2013 2014 2015 2016 2017 2018 2019 2020
# write_csv(combined, "combined.csv")

# in the Time column, there are only numbers, hence the row header was not copied into the data

Sorting Age group

To ensure the age group is sorted into the ideal sequence, we set the order we want.

order <- c("0_to_4", "5_to_9", "10_to_14", "15_to_19", "20_to_24", "25_to_29", "30_to_34", "35_to_39", "40_to_44", "45_to_49", "50_to_54", "55_to_59", "60_to_64", "65_to_69", "70_to_74", "75_to_79", "80_to_84", "85_to_89", "90_and_over")

combined <- combined %>%
  mutate(AG =  factor(AG, levels = order)) %>%
  arrange(AG)

head(combined,5)
# A tibble: 5 x 7
  PA         SZ        AG     Sex   TOD                      Pop  Time
  <chr>      <chr>     <fct>  <chr> <chr>                  <dbl> <dbl>
1 Ang Mo Kio Cheng San 0_to_4 Males HDB 1- and 2-Room Fla~    20  2000
2 Ang Mo Kio Cheng San 0_to_4 Males HDB 3-Room Flats         480  2000
3 Ang Mo Kio Cheng San 0_to_4 Males HDB 4-Room Flats         220  2000
4 Ang Mo Kio Cheng San 0_to_4 Males HDB 5-Room and Execut~    80  2000
5 Ang Mo Kio Cheng San 0_to_4 Males HUDC Flats (excluding~     0  2000
combined4 <- combined %>% 
  filter(PA == c("Ang Mo Kio", "Marine Parade", "Punggol", "Bedok")) 

Summarising data by Age Group, Sex and Time for 4 Planning areas “Ang Mo Kio”, “Marine Parade”, “Punggol”, “Bedok”

To plot the graph over the different years, we need to call out the Time field as a column (variable)

summary_sex_20 <- combined4 %>%
  group_by(AG, Sex, Time, PA) %>%
  summarise(Pop = sum(Pop)) %>%
  ungroup()

head (summary_sex_20,5)
# A tibble: 5 x 5
  AG     Sex      Time PA              Pop
  <fct>  <chr>   <dbl> <chr>         <dbl>
1 0_to_4 Females  2000 Ang Mo Kio      290
2 0_to_4 Females  2000 Bedok          1620
3 0_to_4 Females  2000 Marine Parade   400
4 0_to_4 Females  2000 Punggol           0
5 0_to_4 Females  2001 Ang Mo Kio      250

Plotting double geom_bar Age-sex pyramid for 20 years in 4 areas “Ang Mo Kio”, “Marine Parade”, “Punggol”, “Bedok”

Using the individual Age-sex pyramid from above (plotted for year 2021), we re-use the code to plot out 20 pyramid graphs, 1 graph for each year.

ggplot(summary_sex_20, aes(x=PA)) +
  geom_bar(data=summary_sex_20[summary_sex_20$Sex=="Males",], aes(y=Pop*-1), stat="identity", fill="blue") +
  geom_bar(data=summary_sex_20[summary_sex_20$Sex=="Females",], aes(y=Pop), stat="identity", fill="pink") +
  geom_hline(yintercept=0, colour="white", lwd=1)+
  
coord_flip () +
  
scale_y_continuous(breaks = seq(-160000,160000,40000), labels = function(v) ifelse(abs(v)>=1000,paste0(abs(v)/1000, "K"), abs(v))) +
  
  labs(title = "Age-Sex Population Pyramid, in 4 key areas 2021", 
   caption = 'Data Source: Department of Statistics (June 2021)',
   y = "Population", x = "Gender") + 
  
  theme_bw() +
   theme(legend.position = "none")+
  theme(plot.title = element_text(size=16))+
  theme(plot.subtitle = element_text(size=12))+
  
facet_wrap(. ~ `Time`,ncol=4)

It is clear from the 20 graphs displayed, that the difference in population year on year is not clear. To show more clarity, we use the year as base to transition the graph in 1 frame in the next section.

Using gganimate

but first, we enhance the graph by

adding title caption theme find out the maximum and minimum values of the population to set the chart axis to ensure all the values will be captured properly.

Activating gganimate as it will be used for the animation of the age-sex pyramid over the 20 years

We call out the package required which is ggaminate.

packages = c('gganimate')

for(p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
  }
  library(p, character.only = T)
}

Then we find out the max and min values of the population set.

max(summary_sex_20$Pop)
[1] 7230
min(summary_sex_20$Pop)
[1] 0

Improving the existing code by adding the range limits, title, subtitle and theme.

SG20 <- ggplot(summary_sex_20, aes(x=AG,colour=Sex,fill=Sex)) +
  geom_bar(data=summary_sex_20[summary_sex_20$Sex=="Males",], aes(y=Pop*-1), stat="identity") +
  geom_bar(data=summary_sex_20[summary_sex_20$Sex=="Females",], aes(y=Pop), stat="identity") +
  geom_hline(yintercept=0, colour="white", lwd=1) +
  
coord_flip() +
  
scale_y_continuous(limits = c(-10000, 10000), n.breaks = 10, labels = function(v) ifelse(abs(v)>= 1000,paste0(abs(v)/1000, "K"), abs(v))) +
  
  labs(title = "Singapore Age-Sex Population Pyramid for 20 years in 4 key areas",
    subtitle = 'Year: "{round(frame_time, 0)}"',
    caption = 'Data Source: Department of Statistics (June 2000 to June 2020)',
  y = 'Male and Female Population',
  x = 'Age Group') +
  
  theme_bw () +
   theme(legend.text = element_text(size=12))+
  theme(plot.title = element_text(size=16))+
  theme(plot.subtitle = element_text(size=10)) +
  facet_grid(PA ~ .)

SG20

.. and we animate the age-sex pyramid to see growth in the 4 areas

library(gganimate)

SG20_animated <- SG20 +
  scale_y_continuous(limits = c(-8000, 8000), n.breaks = 10)+
transition_time(Time) +
ease_aes('linear')

SG20_animated

Doing Interactive plots

See below for 2 types of interactive plotting (datatable and 2 interactive graphs). Loading packages for interactive plots

packages = c('tidyverse', 'readxl', 'ggthemes', 'ggiraph', 'plotly', 
             'gganimate', 'patchwork', 'DT', 'gifski', 'gapminder')

for(p in packages){
  if(!require(p, character.only = T)){
    install.packages(p)
  }
  library(p, character.only = T)
}

Doing 2 interactive graphs using the same data set

We create a new data set, and then use the function highlight_key to link both data set to enable the interactive highlight across 2 graphs

data <- summary_sex_20 %>% 
  group_by(Time, PA) %>% 
summarise (Totalpopulation = sum(Pop))

d <-highlight_key(data)
           
p1 <- ggplot (data = d,
        aes(x = Time, y = Totalpopulation, fill = PA)) + 
  geom_col () +
  labs(title = 'Total population that has stayed in the planning area across 20 years')

p2 <- ggplot (data = d,
        aes(x = Time, y = PA, fill = Time)) + 
  geom_col () +
  labs(title = 'Total population that has stayed in the planning area across 20 years')

Putting the 2 graphs beside each other

you can use the graphs to investigate when each area themselves has the highest population

subplot (ggplotly (p1),
         ggplotly (p2))

or you can see the data table below the chart for data information

gg <- highlight(ggplotly(p1),
                "plotly_selected")

crosstalk::bscols(gg,
                  DT::datatable(d),
                  widths = 15)

Animating graph by Singapore Planning Area and age group`

To illustrate the population in the 4 areas across the age group for 20 years (2000 - 2020).

animate1 <- ggplot (summary_sex_20, aes(x = AG, y = Pop/1000))+ 
    geom_col () +
  coord_flip() +
ggtitle('Planning area: {closest_state}') +
  labs (x = 'AG',
        y = 'Population (thousand)') +
transition_states (PA) +
  ease_aes('linear') +
  enter_fade() +
  exit_fade()

animate(animate1,fps=3)